Packages

## load required libraries
library(tidyverse)
library(quanteda)
library(lexicon)
library(reshape2)
library(stringi)
library(quanteda.textplots)
library(quanteda.textmodels)
library(quanteda.textstats)
library(gridExtra)
library(seededlda)
library(ggrepel)
library(ggdendro)
library(factoextra)
library(lattice)

Clean workspace and set working directory

## clean workspace
## NOTE(review): rm(list=ls()) and setwd() are discouraged in reusable
## scripts (they clobber the user's session and hard-code a machine-specific
## path); kept here because this is an interactive tutorial.
rm(list=ls())
## set working directory (WD)
## NOTE(review): adjust this path to your own machine before running.
path <- '~/coliphi21/practice_lessons/lesson_2/src/'
setwd(path)
## check that WD is set correctly
getwd()
## [1] "/Users/lucienbaumgartner/coliphi21/practice_lessons/lesson_2/src"

Import data

For this tutorial you can either work with your own data, or the pre-built corpora provided in the /input-folder for the first practice session. The quanteda-package also contains pre-built corpora you can use. For this session, we scraped the Stanford Encyclopedia of Philosophy and built a corpus including additional metadata. If you work with your own data or our other pre-built corpora, this vignette might be helpful.

## load the pre-built SEP corpus; this restores the object `sfe` used below.
## NOTE(review): the file has an .RDS extension but is read with load(),
## which only works for workspace images written by save(). Since `sfe`
## exists afterwards, the file was presumably written with save() despite
## its extension; a file written by saveRDS() would need
## sfe <- readRDS(...) instead — confirm how the file was created.
load('~/coliphi21/practice_lessons/lesson_2/input/stanford-encyclopedia.RDS')

Inspect data

## how does the corpus object look like?
## (printing a quanteda corpus shows its dimensions and the first documents)
sfe
## Corpus consisting of 1,712 documents and 21 docvars.
## 18thGerman-preKant.json :
## " In Germany, the eighteenth century was the age of enlighten..."
## 
## abduction.json :
## " In the philosophical literature, the term abduction is used..."
## 
## abelard.json :
## " Peter Abelard (1079–21 April 1142) [Abailard or Abaelard or..."
## 
## abhidharma.json :
## " The first centuries after Śākyamuni Buddha death saw the ri..."
## 
## abilities.json :
## " In the accounts we give of one another, claims about our ab..."
## 
## abner-burgos.json :
## " Abner of Burgos (Alfonso de Valladolid; c. 1260–1347) was p..."
## 
## [ reached max_ndoc ... 1,706 more documents ]
## summary statistics (types, tokens, sentences) for the first documents
summary(sfe) %>% head
## available variables: the document-level metadata (docvars) of the corpus
docvars(sfe)

Prep

## tokenization: split into word tokens, dropping punctuation, symbols,
## numbers and URLs (padding = FALSE removes the empty slots entirely)
toks <- tokens(sfe, what = 'word',
               remove_punct = TRUE, remove_symbols = TRUE, padding = FALSE,
               remove_numbers = TRUE, remove_url = TRUE)
## to lower
toks <- tokens_tolower(toks)
## lemmatizing: replace each token with its lemma from the lexicon hash table
toks <- tokens_replace(toks, 
                       pattern = lexicon::hash_lemmas$token, 
                       replacement = lexicon::hash_lemmas$lemma)
## remove stopwords
toks <- tokens_select(toks,  pattern = stopwords("en"), selection = "remove")
## remove noise: tokens containing digits, and any single-character token.
## FIX: the original pattern '^[A-z]$|[0-9]+|^.$' used the character range
## [A-z], which also matches the ASCII characters between 'Z' and 'a'
## ('[', '\', ']', '^', '_', '`'). The alternative was redundant anyway,
## since '^.$' already removes every single-character token, so dropping
## the buggy range does not change the result.
toks <- tokens_select(toks, pattern = '[0-9]+|^.$', valuetype = 'regex', selection = 'remove')
## create dfm, then keep only features above the 80% term-frequency quantile
## that occur in at most 10% of documents (drops rare and ubiquitous terms)
dfm_sfe <- dfm(toks) %>% 
           dfm_trim(min_termfreq = 0.8, termfreq_type = "quantile",
                    max_docfreq = 0.1, docfreq_type = "prop")
dfm_sfe
## Document-feature matrix of: 1,712 documents, 24,689 features (98.47% sparse) and 21 docvars.
##                          features
## docs                      ethos immanuel thomasius pietist thomasians wolff well dis halle pietism
##   18thGerman-preKant.json     2        1        33       6         11    36    0   1    19       7
##   abduction.json              0        0         0       0          0     0    0   0     0       0
##   abelard.json                0        0         0       0          0     0    0   0     0       0
##   abhidharma.json             0        0         0       0          0     0    0   0     0       0
##   abilities.json              0        0         0       0          0     0    0   0     0       0
##   abner-burgos.json           0        0         0       0          0     0    0   0     0       0
## [ reached max_ndoc ... 1,706 more documents, reached max_nfeat ... 24,679 more features ]

Scaling: correspondence analysis

## fit a correspondence analysis (CA) on the trimmed dfm: documents are
## scaled on latent dimensions (analogous to PCA for count data)
sfe_ca <- textmodel_ca(dfm_sfe)
## coerce model coefficients to dataframe
## NOTE: this overwrites the fitted model object with a plain data frame
## holding only the first two document dimensions
sfe_ca <- data.frame(dim1 = coef(sfe_ca, doc_dim = 1)$coef_document, 
                     dim2 = coef(sfe_ca, doc_dim = 2)$coef_document)

## strip ".json" (and a possible ".1"-style duplicate suffix) from the
## row names to get clean document ids
sfe_ca$id <- gsub('\\.json(\\.[0-9])?', '', rownames(sfe_ca))
sfe_ca
## set seed for plot and subsample (labels)
## NOTE: keep the order of the calls below unchanged — sample_frac() draws
## from the RNG, so reordering would change which labels are plotted
set.seed(38972)
## plot full data with branch annotation
ggplot(sfe_ca, aes(x=dim1, y=dim2, label=id)) +
  geom_point(aes(color=dim1-dim2), alpha = 0.2) +
  # plot 0.2 of all labels, using a repel function
  geom_text_repel(data = dplyr::sample_frac(sfe_ca, 0.2), max.overlaps = 15, seed = 6734) +
  theme_bw() +
  theme(plot.title = element_text(face='bold')) +
  labs(title = 'Correspondence Analysis: Full Data')
## Warning: ggrepel: 312 unlabeled data points (too many overlaps). Consider increasing max.overlaps

## same plot, zoomed in via axis limits (points outside the limits are
## dropped, hence the "Removed ... rows" warnings below)
ggplot(sfe_ca, aes(x=dim1, y=dim2, label=id)) +
  geom_point(aes(color=dim1-dim2), alpha = 0.2) +
  # plot 0.2 of all labels, using a repel function
  geom_text_repel(data = dplyr::sample_frac(sfe_ca, 0.2), max.overlaps = 9, seed = 6734) +
  scale_y_continuous(limits=c(-2,0)) +
  scale_x_continuous(limits=c(-1,1)) +
  theme_bw() +
  theme(plot.title = element_text(face='bold')) +
  labs(title = 'Correspondence Analysis: Zoom')
## Warning: Removed 829 rows containing missing values (geom_point).
## Warning: Removed 166 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 127 unlabeled data points (too many overlaps). Consider increasing max.overlaps

Hierarchical clustering

## hierarchical clustering - get distances on normalized dfm
## (rows are converted to proportions first, so document length does not
## dominate the euclidean distances)
sfe_dist_mat <- dfm_weight(dfm_sfe, scheme = "prop") %>%
    textstat_dist(method = "euclidean") %>% 
    as.dist()
## hierarchical clustering of the distance object (Ward's method)
sfe_cluster <- hclust(sfe_dist_mat, method = 'ward.D')
# label with document names (".json" suffix stripped, matching the CA ids
# so the join below works)
sfe_cluster$labels <- gsub('\\.json(\\.[0-9])?', '', docnames(dfm_sfe))
## determine best numbers of clusters (elbow plot; commented out — slow)
# fviz_nbclust(as.matrix(sfe_dist_mat), FUN = hcut, method = "wss")
## cut tree into four groups (k = 4)
clusters <- cutree(sfe_cluster, k = 4)
## add cluster-data to the correspondence analysis (joined on document id)
sfe_ca_hcl <- left_join(sfe_ca, data.frame(cluster = clusters, id = names(clusters)))
## Joining, by = "id"
## plot: CA coordinates colored and faceted by cluster membership
ggplot(sfe_ca_hcl, aes(x=dim1, y=dim2, label=id)) +
  geom_point(aes(color=as.factor(cluster)), alpha = 0.2) +
  facet_grid(~as.factor(cluster))

## hierarchical clustering doesn't provide discrete clusters along
## the dimensions of the correspondence analysis

Unsupervised LDA

## run naive unsupervised topic model with 10 topics
## (seed fixed so the stochastic topic assignment is reproducible)
set.seed(123)
sfe_lda <- textmodel_lda(dfm_sfe, k = 10)
## print top 20 terms per topic
terms(sfe_lda, 20)
##       topic1          topic2        topic3             topic4          topic5         topic6           topic7       topic8         topic9        topic10    
##  [1,] "supervenience" "spacetime"   "disability"       "dewey"         "gene"         "turing"         "avicenna"   "spinoza"      "pythagoras"  "buddhist" 
##  [2,] "trope"         "einstein"    "luck"             "neural"        "molecular"    "gödel"          "ibn"        "reid"         "african"     "chinese"  
##  [3,] "monism"        "kuhn"        "utilitarianism"   "simulation"    "dna"          "algebra"        "ockham"     "brentano"     "feminism"    "heidegger"
##  [4,] "strawson"      "popper"      "privacy"          "peirce"        "darwin"       "intuitionistic" "averroes"   "malebranche"  "parmenides"  "nietzsche"
##  [5,] "fictional"     "weyl"        "coercion"         "ai"            "fitness"      "ordinal"        "arabic"     "marx"         "pornography" "dao"      
##  [6,] "physicalism"   "reichenbach" "capability"       "user"          "inheritance"  "cardinal"       "bacon"      "berlin"       "pythagorean" "confucian"
##  [7,] "chisholm"      "gravity"     "consequentialist" "fodor"         "cancer"       "tarski"         "maimonides" "wolff"        "oppression"  "buddhism" 
##  [8,] "armstrong"     "ramsey"      "distributive"     "artifact"      "collins"      "computation"    "scotus"     "husserl"      "du"          "japanese" 
##  [9,] "jones"         "hole"        "egalitarian"      "searle"        "biologist"    "hilbert"        "theism"     "herder"       "bois"        "buddha"   
## [10,] "bradley"       "bayesian"    "sidgwick"         "quale"         "whitehead"    "bolzano"        "boethius"   "clarke"       "sextus"      "ritual"   
## [11,] "relativism"    "entropy"     "torture"          "imagery"       "drift"        "φ"              "summa"      "conscience"   "racial"      "mohists"  
## [12,] "internalism"   "bohr"        "dworkin"          "digital"       "biodiversity" "algorithm"      "abelard"    "romantic"     "bce"         "laozi"    
## [13,] "austin"        "newtonian"   "desert"           "empathy"       "payoff"       "algebraic"      "trinity"    "fichte"       "iamblichus"  "zhuangzi" 
## [14,] "noun"          "feyerabend"  "coercive"         "husserl"       "genome"       "recursive"      "islamic"    "schopenhauer" "fr"          "zhu"      
## [15,] "thick"         "bell"        "bentham"          "introspection" "adaptation"   "brouwer"        "christ"     "artist"       "philo"       "li"       
## [16,] "implicature"   "gas"         "voter"            "architecture"  "embryo"       "diagram"        "luther"     "hutcheson"    "sophist"     "confucius"
## [17,] "entailment"    "dynamical"   "income"           "delusion"      "offspring"    "provable"       "eternity"   "schlegel"     "cicero"      "dharma"   
## [18,] "plural"        "bet"         "constitutional"   "goodman"       "replication"  "zfc"            "theistic"   "berkeley"     "proclus"     "wang"     
## [19,] "grice"         "mach"        "nozick"           "hallucination" "replicator"   "computable"     "hartshorne" "du"           "zeno"        "emptiness"
## [20,] "φ"             "metric"      "domination"       "neuroscience"  "ecological"   "cantor"         "theist"     "kants"        "protagoras"  "mencius"
## plot the topics over the correspondence analysis data
## (each document is tagged with its most likely topic; 2d-density contours
## show where the topics concentrate in CA space)
sfe_ca$topics <- topics(sfe_lda)
ggplot(sfe_ca, aes(x=dim1, y=dim2, color=topics)) +
  geom_point(alpha = 0.5, shape = '.') +
  geom_density_2d(alpha = 0.5) +
  theme_bw() +
  theme(plot.title = element_text(face='bold')) +
  labs(title = 'Correspondence Analysis with Topic Annotation (k=10)')

Cosine similarities for documents

## subset documents about logic: keep SEP entries whose file name contains
## "logic" as a hyphen-delimited part (e.g. "modal-logic", "logic-classical")
logic <- dfm_subset(dfm_sfe, grepl('(?<=\\-)logic|logic(?=\\-)', docnames(dfm_sfe), perl = TRUE))
## pairwise cosine similarity between the logic documents
logic_sim <- textstat_simil(logic, margin = 'document', method = 'cosine')
## list document pairs with cosine similarity above 0.4, most similar first
logic_sim %>% 
  as.data.frame() %>% 
  filter(cosine > 0.4) %>% 
  arrange(desc(cosine))

Cosine similarities for features

## subset documents about feminism (doc names containing "fem" as part of
## a hyphen-delimited name)
fem <- dfm_subset(dfm_sfe, grepl('(?<=\\-)fem|fem.*(?=\\-)', docnames(dfm_sfe), perl = T))
## compute cosine similarities for the features 
## "empowerment", "embodiment", and "rape"
## FIX: the original called textstat_simil on `logic` (the dfm from the
## previous section), so the `fem` subset created above was never used and
## the similarities were computed over the logic documents instead of the
## feminism documents.
fem_sim <- textstat_simil(fem, fem[, c("empowerment", "embodiment", "rape")], margin = 'feature', method = 'cosine')
## top 5 results per feature: most similar co-occurring features per target
as.data.frame(fem_sim) %>% 
  group_by(feature2) %>% 
  arrange(feature2, desc(cosine)) %>% 
  slice_head(n=5)
 




A work by Lucien Baumgartner & Kevin Reuter